* Author: Joe Tatarka
* Name: 2_main_build.do
* Purpose: Take data spend and visits merged data from 1_merge_visits_and_spend.ipynb and clean to create main_build.dta

* Set Global File Paths
global root = "T:/service_industries/replication_package"

global raw_root = "${root}/datasets/raw"
global intermediate_root = "${root}/datasets/intermediate"
global built_root = "${root}/datasets/built"
global exhibits_root = "${root}/exhibits"

********************************************************************************
*** Part 1: Load in things to merge 
********************************************************************************

************************************
*** 1.1 Bring in inflation deflators 
*************************************
**** Limited-Service Deflators
tempfile cpi_limited
import excel "${raw_root}/cpi/limited_cpi_nsa_feb_2026.xlsx", clear firstrow

rename *, lower
drop seriesid 
rename value cpi_limited

* Destring cpi variable 
replace cpi_limited = "" if cpi_limited == "-"
destring cpi_limited, replace

drop if substr(period, 1,1) == "S"

replace period = substr(period,2,2)
destring period, replace 

gen year_month_gs = ym(year, period)
format year_month_gs %tm

gen base = cpi_limited if year_month_gs == monthly("Jan2019", "MY")
egen base_jan_2019 = max(base)

replace cpi_limited = 100*(cpi_limited/base_jan_2019)

keep if year_month_gs >= monthly("jan2019", "MY") & year_month_gs <= monthly("dec2022", "MY")

keep cpi_limited year_month_gs

save `cpi_limited', replace
********************************************************************************
**** Part 2: Bring in Main Data and Merge 
********************************************************************************
**********************************
**** 2.1 Load in main data and do initial cleaning 
**********************************
import delimited "${intermediate_root}/7225_visits_and_spend_merge", clear 

*drop if no spend
drop if raw_total_spend == .

*** Keep only 50 states and DC,
* drop guam, american samoa, virgin islands, and puerto rico 
tostring poi_cbg, replace format(%17.0g)
replace poi_cbg = "0" + poi_cbg if strlen(poi_cbg) < 12

gen state_fips_code = substr(poi_cbg,1,2)

drop if state_fips_code == "66" | state_fips_code == "60" | state_fips_code == "78" | state_fips_code == "72"

* Keep branded limited service restaurants (so drop both full service and non-branded)
* we do this because branded limited service restaurants have better spend coverage
drop if naics_code == 722511 | brands == ""

compress brands

**** Drop any zeroes or wacky negatives (this drops almost entirely observations with emp == 0)
rename dwell_240_plus emp 
rename raw_total_spend nominal_spend
gen visits = dwell_less_5 + dwell_5_to_10 + dwell_11_to_20 + dwell_21_to_60 + dwell_61_to_120 + dwell_121_to_240

drop if emp <= 0 
drop if nominal_spend <= 0
drop if visits <= 0

***** Drop any POI with only single observation (need multiple obs per POI to use POI fixed effects)
bys placekey: egen count = count(placekey)
drop if count == 1
drop count

*** Turn year_month_gs variable into a numeric format
rename year_month_gs year_month_prelim
gen year_month_gs = monthly(year_month_prelim, "YM")
format year_month_gs %tm
label variable year_month_gs "Year-month"

**********************************
**** 2.2 Merge in Deflator  
**********************************
merge m:1 year_month_gs using `cpi_limited', keep(1 3) nogen

**********************************
**** 2.3 Rename, clean up, and create variables  
**********************************
* Deflate spend
gen spend = (100/cpi_limited)*nominal_spend 

*** label variables 
label var spend "Spend, Jan 2019 $"
label var nominal_spend "Raw Spend"
label var emp "Employees"
label var visits "Visits"

*** Create productivity measures 
gen ln_spend_prod = ln(spend/emp)
label var ln_spend_prod "ln(Spend Productivity)"
gen ln_visits_prod = ln(visits/emp)
label var ln_visits_prod "ln(Visits Productivity)"
gen ln_spend = ln(spend)
label var ln_spend "ln(Spend)"
gen ln_emp = ln(emp)
label var ln_emp "ln(Emp)"
gen ln_visits = ln(visits)
label var ln_visits "ln(Visits)"

* gen avg dwell time 
* multiply middle of time bucket by share of visits from that bucket
gen avg_dwell_time = (2.5*dwell_less_5 + 7.5*dwell_5_to_10 + 15*dwell_11_to_20 + 40*dwell_21_to_60 + 90*dwell_61_to_120 + 180*dwell_121_to_240)/visits
label var avg_dwell_time "Avg Dwell Time"

* dwell shares
gen dwell_0_to_10_share = (dwell_5_to_10 + dwell_less_5)/visits
label variable dwell_0_to_10_share "0-10 min Share"

foreach var in "dwell_11_to_20" "dwell_21_to_60" {
	gen `var'_share = `var'/visits
} 
label var dwell_11_to_20_share "11-20 min Share"
label var dwell_21_to_60_share "21-60 min Share"

gen dwell_61_to_240_share = (dwell_61_to_120 + dwell_121_to_240)/visits
label var dwell_61_to_240_share "61-240 min Share"

order placekey year_month_gs naics_code spend emp visits, first

**** Only keep important variables for replication 
keep placekey year_month_gs naics_code spend emp visits nominal_spend ln_spend ln_emp ln_visits ln_spend_prod ln_visits_prod avg_dwell_time dwell_0_to_10_share dwell_11_to_20_share dwell_21_to_60_share dwell_61_to_240_share brands dwell_less_5 dwell_5_to_10 dwell_11_to_20 dwell_21_to_60 dwell_61_to_120 dwell_121_to_240

sort placekey year_month_gs

**********************************
**** 2.4 Save the dataset 
**********************************
save "${built_root}/main_build.dta", replace
